import urllib.request
from bs4 import BeautifulSoup
import time

print("Start Wikidownload")

#config
baseadress = "http://dumps.wikimedia.org/"
rootadress = "backup-index.html"
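# baseadress is the public Wikimedia dump mirror; rootadress (backup-index.html)
# is the index page that lists the most recent dump run for every wiki.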


# Overview page (dump index)
root = urllib.request.urlopen(baseadress + rootadress)
print(root)
rootcontent = root.read()
#print(rootcontent)

soup = BeautifulSoup(rootcontent, 'html.parser')

#print(soup.prettify())

linklist = soup.find_all('a')
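# keep only anchors whose href contains "2013", i.e. links to 2013 dump runs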
linklist = list(filter(lambda x: "2013" in x.get('href', ''), linklist))
print(linklist)
print(len(linklist))

# debug: use only one link for the next step
#suburllist = list(filter(lambda x: x.get('href').find("dewikibooks/") != -1, linklist))
suburllist = linklist
print(suburllist)
print(len(suburllist))



# Sub-page for each wiki type and language
#subcontent = list(map(lambda x: urllib.request.urlopen(baseadress + x.get('href')).read(), suburllist))
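# fetch each wiki's dump page one at a time, pausing between requests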
subcontent = []
for sublink in suburllist:
    print(sublink)
    subcontent.append(urllib.request.urlopen(baseadress + sublink.get('href')).read())
    time.sleep(10)
#print(subcontent)

# extract the links to the full revision-history archives (7-Zip-compressed XML)
sublinks = list(map(lambda x: BeautifulSoup(x, 'html.parser').find_all('a'), subcontent))
sublinks = list(map(lambda y: list(filter(lambda x: "pages-meta-history.xml.7z" in x.get('href', ''), y)), sublinks))
print(sublinks)

sublinks_raw = list(map(lambda y: list(map(lambda x: x.get('href'), y)), sublinks))

for sublist in sublinks_raw:
    for sublink in sublist:
        print(sublink)    
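
# Optional next step (a sketch, not part of the original script): download the
# archives found above. DOWNLOAD_ARCHIVES, the filename handling, and the URL
# join below are assumptions; in particular this assumes each collected href is
# a path on the same host as baseadress.
DOWNLOAD_ARCHIVES = False
if DOWNLOAD_ARCHIVES:
    for sublist in sublinks_raw:
        for sublink in sublist:
            filename = sublink.split("/")[-1]
            print("downloading " + filename)
            # urlretrieve saves the archive into the current working directory
            urllib.request.urlretrieve(baseadress.rstrip("/") + "/" + sublink.lstrip("/"), filename)
            time.sleep(10)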